import pandas as pd
shots = pd.read_csv("/Users/ben/Desktop/shots_2022.csv")
skaters = pd.read_csv("/Users/ben/Desktop/skaters.csv")
goalies = pd.read_csv("/Users/ben/Desktop/goalies.csv")
shots_sm = shots[["shotID", "shooterName", "shooterPlayerId", "team", "goalieNameForShot", "goalieIdForShot","shotDistance", "shotType", "shotAngleAdjusted", "xCord", "yCord", "shooterTimeOnIce", "shotOnEmptyNet", "shotRebound", "shootingTeamForwardsOnIce", "shootingTeamDefencemenOnIce", "defendingTeamForwardsOnIce", "defendingTeamDefencemenOnIce", "timeSinceFaceoff", "shotWasOnGoal","goal"]]
shots_sm.head()
| shotID | shooterName | shooterPlayerId | team | goalieNameForShot | goalieIdForShot | shotDistance | shotType | shotAngleAdjusted | xCord | ... | shooterTimeOnIce | shotOnEmptyNet | shotRebound | shootingTeamForwardsOnIce | shootingTeamDefencemenOnIce | defendingTeamForwardsOnIce | defendingTeamDefencemenOnIce | timeSinceFaceoff | shotWasOnGoal | goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Timo Meier | 8478414.0 | AWAY | Juuse Saros | 8477424 | 45.705580 | WRIST | 10.080598 | 44 | ... | 23 | 0 | 0 | 3 | 2 | 3 | 2 | 23 | 1.0 | 0 |
| 1 | 1 | Marc-Edouard Vlasic | 8471709.0 | AWAY | Juuse Saros | 8477424 | 52.478567 | WRIST | 30.963757 | 44 | ... | 12 | 0 | 0 | 3 | 2 | 3 | 2 | 12 | 0.0 | 0 |
| 2 | 2 | Mattias Ekholm | 8475218.0 | HOME | James Reimer | 8473503 | 56.568542 | SLAP | 8.130102 | -33 | ... | 35 | 0 | 0 | 3 | 2 | 3 | 2 | 35 | 1.0 | 0 |
| 3 | 3 | Kiefer Sherwood | 8480748.0 | HOME | James Reimer | 8473503 | 15.811388 | WRIST | 18.434949 | -74 | ... | 37 | 0 | 1 | 3 | 2 | 3 | 2 | 37 | 1.0 | 1 |
| 4 | 4 | Colton Sissons | 8476925.0 | HOME | James Reimer | 8473503 | 17.000000 | WRIST | 61.927513 | -81 | ... | 11 | 0 | 0 | 3 | 2 | 3 | 2 | 11 | 1.0 | 0 |
5 rows × 21 columns
shots_sm_og = shots_sm.loc[shots_sm["shotOnEmptyNet"] == 0]
shots_sm_og = shots_sm_og.loc[shots_sm["shotWasOnGoal"] == 1]
shots_sm_og["shootingPlayersOnIce"] = shots_sm_og["shootingTeamForwardsOnIce"] + shots_sm_og["shootingTeamDefencemenOnIce"]
shots_sm_og["defendingPlayersOnIce"] = shots_sm_og["defendingTeamForwardsOnIce"] + shots_sm_og["defendingTeamDefencemenOnIce"]
shots_sm_og = shots_sm_og.loc[shots_sm_og["defendingPlayersOnIce"] < 6]
shots_sm_og["PP"] = ((shots_sm_og["shootingPlayersOnIce"] > shots_sm_og["defendingPlayersOnIce"]) & (shots_sm_og["shootingPlayersOnIce"] < 6)).astype(int)
shots_sm_og["PK"] = (shots_sm_og["shootingPlayersOnIce"] < shots_sm_og["defendingPlayersOnIce"]).astype(int)
shots_sm_og["EA"] = (shots_sm_og["shootingPlayersOnIce"] == 6).astype(int)
skaters_sm = skaters[["playerId", "name", "I_F_goals", "I_F_shotsOnGoal", "situation"]]
skaters_sm_all = skaters_sm.loc[skaters_sm["situation"] == "all"]
skaters_sm_all["shotPct"] = skaters_sm_all["I_F_goals"] / skaters_sm_all["I_F_shotsOnGoal"] * 100
/var/folders/0h/sq3yj08505x_drk02vswsmpc0000gn/T/ipykernel_1633/3456088992.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy skaters_sm_all["shotPct"] = skaters_sm_all["I_F_goals"] / skaters_sm_all["I_F_shotsOnGoal"] * 100
skaters_sm_all.head()
| playerId | name | I_F_goals | I_F_shotsOnGoal | situation | shotPct | |
|---|---|---|---|---|---|---|
| 1 | 8471817 | Ryan Reaves | 5.0 | 48.0 | all | 10.416667 |
| 6 | 8480950 | Ilya Lyubushkin | 2.0 | 41.0 | all | 4.878049 |
| 11 | 8475625 | Matt Irwin | 2.0 | 79.0 | all | 2.531646 |
| 16 | 8480860 | Kevin Bahl | 2.0 | 27.0 | all | 7.407407 |
| 21 | 8477952 | Robby Fabbri | 7.0 | 35.0 | all | 20.000000 |
goalies_sm = goalies[["playerId", "name", "goals", "ongoal", "situation"]]
goalies_sm_all = goalies_sm.loc[goalies_sm["situation"] == "all"]
goalies_sm_all["savePct"] = (1-(goalies_sm_all["goals"] / goalies_sm_all["ongoal"])) * 100
/var/folders/0h/sq3yj08505x_drk02vswsmpc0000gn/T/ipykernel_1633/1365213038.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy goalies_sm_all["savePct"] = (1-(goalies_sm_all["goals"] / goalies_sm_all["ongoal"])) * 100
goalies_sm_all.head()
| playerId | name | goals | ongoal | situation | savePct | |
|---|---|---|---|---|---|---|
| 1 | 8478048 | Igor Shesterkin | 144.0 | 1719.0 | all | 91.623037 |
| 6 | 8478872 | Karel Vejmelka | 168.0 | 1671.0 | all | 89.946140 |
| 11 | 8479979 | Jake Oettinger | 144.0 | 1777.0 | all | 91.896455 |
| 16 | 8477968 | Alex Nedeljkovic | 49.0 | 466.0 | all | 89.484979 |
| 21 | 8477967 | Thatcher Demko | 99.0 | 1005.0 | all | 90.149254 |
shots_sm_og = shots_sm_og.merge(skaters_sm_all[["playerId", "shotPct"]], left_on="shooterPlayerId", right_on="playerId", how="left")
shots_sm_og = shots_sm_og.merge(goalies_sm_all[["playerId", "savePct"]], left_on="goalieIdForShot", right_on="playerId", how="left")
shots_sm_og.head()
| shotID | shooterName | shooterPlayerId | team | goalieNameForShot | goalieIdForShot | shotDistance | shotType | shotAngleAdjusted | xCord | ... | goal | shootingPlayersOnIce | defendingPlayersOnIce | PP | PK | EA | playerId_x | shotPct | playerId_y | savePct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Timo Meier | 8478414.0 | AWAY | Juuse Saros | 8477424 | 45.705580 | WRIST | 10.080598 | 44 | ... | 0 | 5 | 5 | 0 | 0 | 0 | 8478414.0 | 12.195122 | 8477424 | 91.853263 |
| 1 | 2 | Mattias Ekholm | 8475218.0 | HOME | James Reimer | 8473503 | 56.568542 | SLAP | 8.130102 | -33 | ... | 0 | 5 | 5 | 0 | 0 | 0 | 8475218.0 | 6.666667 | 8473503 | 88.988764 |
| 2 | 3 | Kiefer Sherwood | 8480748.0 | HOME | James Reimer | 8473503 | 15.811388 | WRIST | 18.434949 | -74 | ... | 1 | 5 | 5 | 0 | 0 | 0 | 8480748.0 | 12.280702 | 8473503 | 88.988764 |
| 3 | 4 | Colton Sissons | 8476925.0 | HOME | James Reimer | 8473503 | 17.000000 | WRIST | 61.927513 | -81 | ... | 0 | 5 | 5 | 0 | 0 | 0 | 8476925.0 | 13.793103 | 8473503 | 88.988764 |
| 4 | 6 | Steven Lorentz | 8478904.0 | AWAY | Juuse Saros | 8477424 | 17.117243 | TIP | 6.709837 | 72 | ... | 0 | 5 | 5 | 0 | 0 | 0 | 8478904.0 | 9.009009 | 8477424 | 91.853263 |
5 rows × 30 columns
shots_sm_og.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 87042 entries, 0 to 87041 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 shotID 87042 non-null int64 1 shooterName 87039 non-null object 2 shooterPlayerId 87039 non-null float64 3 team 87042 non-null object 4 goalieNameForShot 87042 non-null object 5 goalieIdForShot 87042 non-null int64 6 shotDistance 87042 non-null float64 7 shotType 86349 non-null object 8 shotAngleAdjusted 87042 non-null float64 9 xCord 87042 non-null int64 10 yCord 87042 non-null int64 11 shooterTimeOnIce 87042 non-null int64 12 shotOnEmptyNet 87042 non-null int64 13 shotRebound 87042 non-null int64 14 shootingTeamForwardsOnIce 87042 non-null int64 15 shootingTeamDefencemenOnIce 87042 non-null int64 16 defendingTeamForwardsOnIce 87042 non-null int64 17 defendingTeamDefencemenOnIce 87042 non-null int64 18 timeSinceFaceoff 87042 non-null int64 19 shotWasOnGoal 87042 non-null float64 20 goal 87042 non-null int64 21 shootingPlayersOnIce 87042 non-null int64 22 defendingPlayersOnIce 87042 non-null int64 23 PP 87042 non-null int64 24 PK 87042 non-null int64 25 EA 87042 non-null int64 26 playerId_x 87014 non-null float64 27 shotPct 87014 non-null float64 28 playerId_y 87042 non-null int64 29 savePct 87042 non-null float64 dtypes: float64(7), int64(19), object(4) memory usage: 20.6+ MB
shots_sm_og.shotType.value_counts()
WRIST 47445 SNAP 14115 SLAP 10048 BACK 6850 TIP 5716 DEFL 1395 WRAP 780 Name: shotType, dtype: int64
from sklearn.impute import SimpleImputer
imputeShots = SimpleImputer(strategy='most_frequent')
shots_sm_og[["shotType"]] = imputeShots.fit_transform(shots_sm_og[["shotType"]])
/opt/anaconda3/lib/python3.8/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
shots_sm_og.shotType.value_counts()
WRIST 48138 SNAP 14115 SLAP 10048 BACK 6850 TIP 5716 DEFL 1395 WRAP 780 Name: shotType, dtype: int64
shots_sm_og[shots_sm_og["shotPct"].isnull()]
| shotID | shooterName | shooterPlayerId | team | goalieNameForShot | goalieIdForShot | shotDistance | shotType | shotAngleAdjusted | xCord | ... | goal | shootingPlayersOnIce | defendingPlayersOnIce | PP | PK | EA | playerId_x | shotPct | playerId_y | savePct | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1856 | 2528 | NaN | NaN | AWAY | Ilya Samsonov | 8478492 | 26.627054 | WRIST | 34.286877 | 67 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478492 | 91.891892 |
| 1887 | 2578 | NaN | NaN | AWAY | Ilya Samsonov | 8478492 | 9.899495 | WRIST | 45.000000 | 82 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478492 | 91.891892 |
| 3060 | 4209 | NaN | NaN | HOME | Jeremy Swayman | 8480280 | 9.433981 | BACK | 32.005383 | -81 | ... | 1 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8480280 | 92.025184 |
| 13952 | 19313 | Darcy Kuemper | 8475311.0 | HOME | Andrei Vasilevskiy | 8476883 | 12.041595 | SNAP | 4.763642 | 77 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8476883 | 91.520000 |
| 33564 | 46672 | Jonathan Quick | 8471734.0 | AWAY | Karel Vejmelka | 8478872 | 8.062258 | WRIST | 7.125016 | 81 | ... | 0 | 4 | 5 | 0 | 1 | 0 | NaN | NaN | 8478872 | 89.946140 |
| 35796 | 49784 | Alex Stalock | 8471774.0 | AWAY | Joonas Korpisalo | 8476914 | 6.324555 | WRIST | 18.434949 | 83 | ... | 0 | 4 | 5 | 0 | 1 | 0 | NaN | NaN | 8476914 | 91.383596 |
| 83119 | 116350 | Mackenzie MacEachern | 8476907.0 | AWAY | Ilya Sorokin | 8478009 | 24.186773 | BACK | 82.874984 | -86 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478009 | 92.387167 |
| 83127 | 116360 | Mackenzie MacEachern | 8476907.0 | AWAY | Ilya Sorokin | 8478009 | 28.017851 | WRIST | 2.045408 | -61 | ... | 1 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478009 | 92.387167 |
| 83844 | 117392 | Tye Kartye | 8481789.0 | AWAY | Alexandar Georgiev | 8480382 | 17.492856 | SNAP | 30.963757 | -74 | ... | 1 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8480382 | 91.811024 |
| 83967 | 117561 | Igor Shesterkin | 8478048.0 | AWAY | Akira Schmid | 8481033 | 8.062258 | WRIST | 7.125016 | -81 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8481033 | 92.156863 |
| 84079 | 117714 | Mackenzie MacEachern | 8476907.0 | AWAY | Ilya Sorokin | 8478009 | 18.027756 | TIP | 19.440035 | 72 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478009 | 92.387167 |
| 84098 | 117740 | Mackenzie MacEachern | 8476907.0 | AWAY | Ilya Sorokin | 8478009 | 9.055385 | DEFL | 6.340192 | 80 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8478009 | 92.387167 |
| 84271 | 117997 | Tye Kartye | 8481789.0 | HOME | Alexandar Georgiev | 8480382 | 35.846897 | WRIST | 22.988717 | 56 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8480382 | 91.811024 |
| 84755 | 118691 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 51.478151 | WRIST | 29.054604 | 44 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 84822 | 118789 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 13.928388 | TIP | 21.037511 | -76 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85026 | 119106 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 37.121422 | WRIST | 27.255328 | -56 | ... | 1 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85464 | 119726 | Mackenzie MacEachern | 8476907.0 | AWAY | Akira Schmid | 8481033 | 33.241540 | WRIST | 46.218875 | -66 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8481033 | 92.156863 |
| 85469 | 119732 | Mackenzie MacEachern | 8476907.0 | AWAY | Akira Schmid | 8481033 | 42.190046 | SLAP | 31.429566 | -53 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8481033 | 92.156863 |
| 85501 | 119783 | Tye Kartye | 8481789.0 | HOME | Jake Oettinger | 8479979 | 58.821765 | WRIST | 17.818889 | -33 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85511 | 119801 | Tye Kartye | 8481789.0 | HOME | Jake Oettinger | 8479979 | 33.941125 | WRIST | 45.000000 | 65 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85641 | 119985 | Mackenzie MacEachern | 8476907.0 | HOME | Akira Schmid | 8481033 | 5.000000 | TIP | 53.130102 | 86 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8481033 | 92.156863 |
| 85704 | 120079 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 36.891733 | WRIST | 57.171458 | 69 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85737 | 120122 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 33.421550 | SLAP | 51.072456 | 68 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85914 | 120353 | Tye Kartye | 8481789.0 | HOME | Jake Oettinger | 8479979 | 43.046487 | WRIST | 30.735488 | 52 | ... | 0 | 5 | 4 | 1 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 85927 | 120369 | Tye Kartye | 8481789.0 | HOME | Jake Oettinger | 8479979 | 31.780497 | WRIST | 24.145542 | -60 | ... | 1 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 86026 | 120523 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 8.246211 | TIP | 14.036243 | 81 | ... | 0 | 6 | 5 | 0 | 0 | 1 | NaN | NaN | 8479979 | 91.896455 |
| 86034 | 120538 | Tye Kartye | 8481789.0 | AWAY | Jake Oettinger | 8479979 | 59.682493 | WRIST | 31.293039 | -38 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8479979 | 91.896455 |
| 86381 | 121054 | Mackenzie MacEachern | 8476907.0 | AWAY | Sergei Bobrovsky | 8475683 | 30.479501 | SLAP | 41.009087 | 66 | ... | 0 | 5 | 5 | 0 | 0 | 0 | NaN | NaN | 8475683 | 90.109141 |
28 rows × 30 columns
tye_pct = ((shots_sm_og["goal"]==1) & (shots_sm_og["shooterPlayerId"]==8481789)).sum() / (shots_sm_og["shooterPlayerId"]==8481789).sum()
mac_pct = ((shots_sm_og["goal"]==1) & (shots_sm_og["shooterPlayerId"]==8476907)).sum() / (shots_sm_og["shooterPlayerId"]==8476907).sum()
shots_sm_og.loc[shots_sm_og["shooterPlayerId"] == 8481789, "shotPct"] = shots_sm_og.loc[shots_sm_og["shooterPlayerId"] == 8481789, "shotPct"].fillna(tye_pct)
shots_sm_og.loc[shots_sm_og["shooterPlayerId"] == 8476907, "shotPct"] = shots_sm_og.loc[shots_sm_og["shooterPlayerId"] == 8476907, "shotPct"].fillna(mac_pct)
shots_sm_og["shotPct"] = shots_sm_og.apply(lambda player: 0 if player['shooterPlayerId'] in [8475311,8471734,8471774,8478048] else player['shotPct'], axis=1)
imputePct = SimpleImputer(strategy='median')
shots_sm_og[["shotPct"]] = imputePct.fit_transform(shots_sm_og[["shotPct"]])
from sklearn.preprocessing import OneHotEncoder
shot_encoder = OneHotEncoder()
shot_1hot = shot_encoder.fit_transform(shots_sm_og[['shotType']])
shot_1hot = shot_encoder.transform(shots_sm_og[['shotType']]).toarray()
outs = pd.DataFrame(shot_1hot, columns = shot_encoder.get_feature_names_out(), index = shots_sm_og.index)
shots_sm_og = shots_sm_og.merge(outs, left_index=True, right_index=True)
shots_sm_og.head()
| shotID | shooterName | shooterPlayerId | team | goalieNameForShot | goalieIdForShot | shotDistance | shotType | shotAngleAdjusted | xCord | ... | shotPct | playerId_y | savePct | shotType_BACK | shotType_DEFL | shotType_SLAP | shotType_SNAP | shotType_TIP | shotType_WRAP | shotType_WRIST | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Timo Meier | 8478414.0 | AWAY | Juuse Saros | 8477424 | 45.705580 | WRIST | 10.080598 | 44 | ... | 12.195122 | 8477424 | 91.853263 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 2 | Mattias Ekholm | 8475218.0 | HOME | James Reimer | 8473503 | 56.568542 | SLAP | 8.130102 | -33 | ... | 6.666667 | 8473503 | 88.988764 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 3 | Kiefer Sherwood | 8480748.0 | HOME | James Reimer | 8473503 | 15.811388 | WRIST | 18.434949 | -74 | ... | 12.280702 | 8473503 | 88.988764 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 3 | 4 | Colton Sissons | 8476925.0 | HOME | James Reimer | 8473503 | 17.000000 | WRIST | 61.927513 | -81 | ... | 13.793103 | 8473503 | 88.988764 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 4 | 6 | Steven Lorentz | 8478904.0 | AWAY | Juuse Saros | 8477424 | 17.117243 | TIP | 6.709837 | 72 | ... | 9.009009 | 8477424 | 91.853263 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
5 rows × 37 columns
#!pip install hockey-rink
from hockey_rink import NHLRink
import matplotlib.pyplot as plt
import seaborn as sns
rink=NHLRink()
my_colors = ['#87cefa', '#ff0000']
fig, axs = plt.subplots(1, 1, figsize=(16, 12))
rink.plot_fn(sns.scatterplot, x="xCord", y="yCord", hue="goal", legend=True, data=shots_sm_og, ax=axs,palette=my_colors);
axs.set_title('Goal or Save?', fontsize=18)
axs.legend(title="Goal", bbox_to_anchor=(0.02, 0.98))
<matplotlib.legend.Legend at 0x7f89740334f0>
goal_sm_og = shots_sm_og.loc[shots_sm_og["goal"] ==1]
fig1, ax1 = plt.subplots(1, 1, figsize=(10,5))
sns.kdeplot(data=shots_sm_og, x="shotDistance", hue="goal", ax=ax1)
<AxesSubplot: xlabel='shotDistance', ylabel='Density'>
rink2=NHLRink()
fig, axs = plt.subplots(1, 1, figsize=(16, 12))
rink.plot_fn(sns.scatterplot, x="xCord", y="yCord", hue="shotType", legend=True, data=goal_sm_og, ax=axs);
axs.set_title('What scores?', fontsize=14)
axs.legend(title="Shot Type", bbox_to_anchor=(0.02, 0.98))
<matplotlib.legend.Legend at 0x7f8971dd0b20>
shots_limited = shots_sm_og[["shotDistance", "shotAngleAdjusted", "shooterTimeOnIce","timeSinceFaceoff", "shotPct", "savePct", "shotRebound", "PP", "PK", "EA", "shotType_BACK", "shotType_DEFL", "shotType_SLAP", "shotType_SNAP", "shotType_TIP", "shotType_WRIST", "shotType_WRAP", "goal"]]
goal_limited = goal_sm_og[["shotDistance", "shotAngleAdjusted", "shooterTimeOnIce","timeSinceFaceoff", "shotPct", "savePct", "shotRebound", "PP", "PK", "EA", "shotType_BACK", "shotType_DEFL", "shotType_SLAP", "shotType_SNAP", "shotType_TIP", "shotType_WRIST", "shotType_WRAP"]]
shots_limited.describe()
| shotDistance | shotAngleAdjusted | shooterTimeOnIce | timeSinceFaceoff | shotPct | savePct | shotRebound | PP | PK | EA | shotType_BACK | shotType_DEFL | shotType_SLAP | shotType_SNAP | shotType_TIP | shotType_WRIST | shotType_WRAP | goal | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 | 87042.000000 |
| mean | 32.735800 | 31.537386 | 33.272570 | 63.568117 | 10.074992 | 90.485140 | 0.074906 | 0.148135 | 0.028791 | 0.019519 | 0.078698 | 0.016027 | 0.115439 | 0.162163 | 0.065669 | 0.553043 | 0.008961 | 0.095609 |
| std | 19.566176 | 21.011407 | 23.021861 | 58.674182 | 4.590280 | 1.407890 | 0.263242 | 0.355236 | 0.167219 | 0.138342 | 0.269268 | 0.125579 | 0.319552 | 0.368602 | 0.247705 | 0.497181 | 0.094239 | 0.294056 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 78.571429 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 15.297059 | 14.796762 | 17.000000 | 21.000000 | 6.666667 | 89.462366 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 30.886890 | 29.744881 | 29.000000 | 46.000000 | 10.317460 | 90.455342 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| 75% | 46.529560 | 45.000000 | 44.000000 | 87.000000 | 13.095238 | 91.520000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 |
| max | 98.412398 | 88.408860 | 437.000000 | 542.000000 | 100.000000 | 100.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
goal_limited.describe()
| shotDistance | shotAngleAdjusted | shooterTimeOnIce | timeSinceFaceoff | shotPct | savePct | shotRebound | PP | PK | EA | shotType_BACK | shotType_DEFL | shotType_SLAP | shotType_SNAP | shotType_TIP | shotType_WRIST | shotType_WRAP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 | 8322.000000 |
| mean | 20.959803 | 27.947622 | 36.109349 | 64.912761 | 12.125559 | 90.292355 | 0.146359 | 0.216414 | 0.027758 | 0.023792 | 0.082432 | 0.028238 | 0.090123 | 0.194424 | 0.108147 | 0.489546 | 0.007090 |
| std | 13.684099 | 20.231000 | 24.881158 | 58.605555 | 4.366189 | 1.434339 | 0.353487 | 0.411825 | 0.164288 | 0.152411 | 0.275038 | 0.165663 | 0.286374 | 0.395781 | 0.310585 | 0.499921 | 0.083906 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 78.571429 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 9.848858 | 10.575672 | 19.000000 | 24.000000 | 9.411765 | 89.375685 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 17.464249 | 26.565051 | 31.000000 | 48.000000 | 12.107623 | 90.208078 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 29.120440 | 41.633539 | 47.000000 | 87.000000 | 14.925373 | 91.412742 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 91.923882 | 88.090848 | 239.000000 | 516.000000 | 100.000000 | 95.833333 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
corrs = shots_limited.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corrs, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
<AxesSubplot: >
corrs['goal'].sort_values(ascending=False)
goal 1.000000 shotPct 0.145247 shotRebound 0.088255 PP 0.062495 shotType_TIP 0.055757 shooterTimeOnIce 0.040064 shotType_DEFL 0.031618 shotType_SNAP 0.028458 EA 0.010043 timeSinceFaceoff 0.007451 shotType_BACK 0.004509 PK -0.002008 shotType_WRAP -0.006457 shotType_SLAP -0.025759 shotType_WRIST -0.041526 savePct -0.044522 shotAngleAdjusted -0.055550 shotDistance -0.195689 Name: goal, dtype: float64
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(shots_limited[shots_limited.columns.difference(['goal'])], shots_limited[['goal']], test_size=.3, random_state=55)
cont_vars = X_train[["shotPct", "savePct", "shooterTimeOnIce", "timeSinceFaceoff", "shotAngleAdjusted", "shotDistance"]]
fig, ax1 = plt.subplots(1,1, figsize=(8, 8))
cont_vars.hist(bins=50, ax=ax1)
/var/folders/0h/sq3yj08505x_drk02vswsmpc0000gn/T/ipykernel_1633/31952167.py:5: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared. cont_vars.hist(bins=50, ax=ax1)
array([[<AxesSubplot: title={'center': 'shotPct'}>,
<AxesSubplot: title={'center': 'savePct'}>],
[<AxesSubplot: title={'center': 'shooterTimeOnIce'}>,
<AxesSubplot: title={'center': 'timeSinceFaceoff'}>],
[<AxesSubplot: title={'center': 'shotAngleAdjusted'}>,
<AxesSubplot: title={'center': 'shotDistance'}>]], dtype=object)
import numpy as np
fig, ax1 = plt.subplots(1,2, figsize=(10,5))
np.log(X_train['shooterTimeOnIce']+0.001).hist(ax=ax1[0], bins=50);
np.log(X_train['timeSinceFaceoff']+0.001).hist(ax=ax1[1], bins=50);
X_train['shooterTimeOnIce'] = np.log(X_train['shooterTimeOnIce']+0.001)
X_train['timeSinceFaceoff'] = np.log(X_train['timeSinceFaceoff']+0.001)
X_test['shooterTimeOnIce'] = np.log(X_test['shooterTimeOnIce']+0.001)
X_test['timeSinceFaceoff'] = np.log(X_test['timeSinceFaceoff']+0.001)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train_subset = cont_vars
X_train_scaled = std_scaler.fit_transform(X_train_subset)
X_train_subset_std_scaled = std_scaler.fit_transform(X_train_subset)
X_train[["shotPct", "savePct", "shooterTimeOnIce", "timeSinceFaceoff", "shotAngleAdjusted", "shotDistance"]] = pd.DataFrame(X_train_subset_std_scaled, columns = X_train_subset.columns,index = X_train_subset.index)
X_train.head()
| EA | PK | PP | savePct | shooterTimeOnIce | shotAngleAdjusted | shotDistance | shotPct | shotRebound | shotType_BACK | shotType_DEFL | shotType_SLAP | shotType_SNAP | shotType_TIP | shotType_WRAP | shotType_WRIST | timeSinceFaceoff | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5173 | 0 | 0 | 0 | -2.024079 | -1.227792 | 0.642202 | -1.020543 | -0.379338 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.997459 |
| 32201 | 0 | 0 | 0 | -0.638946 | 0.205064 | -0.897123 | 0.422415 | -1.691086 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.435061 |
| 32552 | 0 | 0 | 1 | 0.146350 | 0.812942 | 0.254980 | 0.116185 | 0.634295 | 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | -0.418018 |
| 48917 | 0 | 0 | 0 | -0.785110 | -0.446234 | 0.151401 | -0.863168 | -0.309667 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.349849 |
| 72006 | 0 | 0 | 0 | 2.473370 | -0.663333 | -1.010266 | -0.528904 | 1.090338 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | -0.043086 |
X_test_subset = X_test[["shotPct", "savePct", "shooterTimeOnIce", "timeSinceFaceoff", "shotAngleAdjusted", "shotDistance"]]
X_test_subset_std_scaled = std_scaler.transform(X_test_subset)
X_test[["shotPct", "savePct", "shooterTimeOnIce", "timeSinceFaceoff", "shotAngleAdjusted", "shotDistance"]] = pd.DataFrame(X_test_subset_std_scaled, columns = X_test_subset.columns, index = X_test_subset.index)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
from sklearn.tree import DecisionTreeClassifier
tree_goal = DecisionTreeClassifier(max_depth=4, random_state=55)
tree_goal.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=4, random_state=55)
from sklearn.metrics import accuracy_score
y_preds = tree_goal.predict(X_train)
accuracy_score(y_train, y_preds)
0.9032316302581693
from sklearn.model_selection import cross_val_score
tree_cval = cross_val_score(tree_goal, X_train, y_train, scoring="accuracy", cv=10)
print(tree_cval)
print(tree_cval.mean())
[0.90316757 0.90300345 0.90316757 0.90316757 0.90316757 0.90316757 0.90316757 0.90333169 0.90333169 0.90331582] 0.9031988070872152
from sklearn.model_selection import GridSearchCV
params = {'min_samples_leaf': range(1,30)}
goal_tree2 = DecisionTreeClassifier(random_state=55, max_depth=5)
grid_search = GridSearchCV(goal_tree2, params, cv=10)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=10,
estimator=DecisionTreeClassifier(max_depth=5, random_state=55),
param_grid={'min_samples_leaf': range(1, 30)})
print(grid_search.best_params_)
{'min_samples_leaf': 11}
plt.plot(params['min_samples_leaf'], grid_search.cv_results_['mean_test_score'])
plt.xlabel('min_samples_leaf')
plt.ylabel('Accuracy')
Text(0, 0.5, 'Accuracy')
new_goal_tree = grid_search.best_estimator_
y_pred_new = new_goal_tree.predict(X_train)
accuracy_new = accuracy_score(y_train, y_pred_new)
print(accuracy_new)
0.9032152177124194
from sklearn.ensemble import AdaBoostClassifier
ada_goals = AdaBoostClassifier(new_goal_tree, n_estimators=100, learning_rate=0.1, random_state=55)
ada_goals.fit(X_train, y_train)
y_pred_ada = ada_goals.predict(X_train)
accuracy_score(y_train, y_pred_ada)
0.9062679512219141
params = {'learning_rate': [0.05,0.1,0.15,0.2]}
grid_search = GridSearchCV(ada_goals, params, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)
GridSearchCV(cv=10,
estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=5,
min_samples_leaf=11,
random_state=55),
learning_rate=0.1, n_estimators=100,
random_state=55),
param_grid={'learning_rate': [0.05, 0.1, 0.15, 0.2]},
scoring='accuracy')
print(grid_search.best_params_)
{'learning_rate': 0.05}
new_ada_tree = grid_search.best_estimator_
y_pred_ada = new_ada_tree.predict(X_train)
accuracy_ada_new = accuracy_score(y_train, y_pred_ada)
print(accuracy_ada_new)
0.9038388944509183
for name, i in zip(X_train.columns, new_goal_tree.feature_importances_):
print(name, ': ', i)
EA : 0.0 PK : 0.0 PP : 0.040341948544053714 savePct : 0.013827500566485301 shooterTimeOnIce : 0.0059124998231809495 shotAngleAdjusted : 0.12038247955995461 shotDistance : 0.695956299104606 shotPct : 0.11240210850683287 shotRebound : 0.0 shotType_BACK : 0.0 shotType_DEFL : 0.0 shotType_SLAP : 0.007534107171950436 shotType_SNAP : 0.0036430567229361636 shotType_TIP : 0.0 shotType_WRAP : 0.0 shotType_WRIST : 0.0 timeSinceFaceoff : 0.0
for name, i in zip(X_train.columns, new_ada_tree.feature_importances_):
print(name, ': ', i)
EA : 0.0001924223117769841 PK : 0.0018404260113325638 PP : 0.015999958086972896 savePct : 0.08014211540385649 shooterTimeOnIce : 0.0783315053800441 shotAngleAdjusted : 0.1307280361205362 shotDistance : 0.37985259668452215 shotPct : 0.13551465139609717 shotRebound : 0.02558183601666707 shotType_BACK : 0.016160256980068795 shotType_DEFL : 0.002281060637928967 shotType_SLAP : 0.02662590400192699 shotType_SNAP : 0.019745782969128517 shotType_TIP : 0.0005698674142425813 shotType_WRAP : 0.00728335096731764 shotType_WRIST : 0.008577021769295501 timeSinceFaceoff : 0.07057320784828519
goal_probs2 = new_goal_tree.predict_proba(X_train)
goal_probs2[:20]
array([[0.82948082, 0.17051918],
[0.9397646 , 0.0602354 ],
[0.89585666, 0.10414334],
[0.82948082, 0.17051918],
[0.78362175, 0.21637825],
[0.82948082, 0.17051918],
[0.9397646 , 0.0602354 ],
[0.97430681, 0.02569319],
[0.96605505, 0.03394495],
[0.9397646 , 0.0602354 ],
[0.89503954, 0.10496046],
[0.82948082, 0.17051918],
[0.82948082, 0.17051918],
[0.98815331, 0.01184669],
[0.95685786, 0.04314214],
[0.99104255, 0.00895745],
[0.86525173, 0.13474827],
[0.97430681, 0.02569319],
[0.9009009 , 0.0990991 ],
[0.97430681, 0.02569319]])
goal_probs = new_ada_tree.predict_proba(X_train)
goal_probs[:20]
array([[0.55049976, 0.44950024],
[0.73126839, 0.26873161],
[0.59801078, 0.40198922],
[0.60121438, 0.39878562],
[0.55167713, 0.44832287],
[0.59790198, 0.40209802],
[0.63887083, 0.36112917],
[0.64703371, 0.35296629],
[0.98704891, 0.01295109],
[0.64231724, 0.35768276],
[0.56518995, 0.43481005],
[0.55868153, 0.44131847],
[0.57805599, 0.42194401],
[0.70731577, 0.29268423],
[0.59930876, 0.40069124],
[0.94695786, 0.05304214],
[0.58394953, 0.41605047],
[0.69710819, 0.30289181],
[0.59721175, 0.40278825],
[0.67372198, 0.32627802]])
from sklearn.linear_model import LogisticRegression
lr_goals = LogisticRegression(random_state=55, penalty='l2')
lr_goals.fit(X_train, y_train)
y_pred_lr = lr_goals.predict(X_train)
accuracy_score(y_train, y_pred_lr)
0.9031003298921696
logit_cval = cross_val_score(lr_goals, X_train, y_train, scoring="accuracy", cv=10)
print(logit_cval)
print(logit_cval.mean())
[0.90316757 0.90316757 0.90300345 0.90316757 0.90300345 0.9026752 0.9026752 0.90333169 0.90333169 0.90282337] 0.9030346762411771
params2 = {'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]}
lr_goals2 = LogisticRegression(random_state=55, penalty='l2')
grid_search2 = GridSearchCV(lr_goals2, params2, cv=10)
grid_search2.fit(X_train, y_train)
GridSearchCV(cv=10, estimator=LogisticRegression(random_state=55),
param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]})
print(grid_search2.best_params_)
{'C': 0.01}
plt.plot(params2['C'], grid_search2.cv_results_['mean_test_score'])
plt.xlabel('C value')
plt.ylabel('Accuracy')
Text(0, 0.5, 'Accuracy')
new_lr = grid_search2.best_estimator_
y_pred_new2 = new_lr.predict(X_train)
accuracy2 = accuracy_score(y_train, y_pred_new2)
print(accuracy2)
0.9031003298921696
coefs = new_lr.coef_[0]
for name, i in zip(X_train.columns, coefs):
print(name, ': ', i)
EA : 0.01651959207904577 PK : 0.039312667986854434 PP : 0.2087794388034147 savePct : -0.14060866947201187 shooterTimeOnIce : 0.04484789108338324 shotAngleAdjusted : -0.2710889952392761 shotDistance : -0.8566600801761866 shotPct : 0.3031009210467715 shotRebound : 0.21590914038438105 shotType_BACK : -0.34250215201218936 shotType_DEFL : -0.035054291629563944 shotType_SLAP : 0.40193814231948227 shotType_SNAP : 0.3180347525796388 shotType_TIP : -0.10353018828102824 shotType_WRAP : -0.20873921500606452 shotType_WRIST : -0.02901464053786811 timeSinceFaceoff : 0.025887300998685366
goal_prob_lr = new_lr.predict_proba(X_train)
goal_prob_lr[:20]
array([[0.84618963, 0.15381037],
[0.96279138, 0.03720862],
[0.88508012, 0.11491988],
[0.88329133, 0.11670867],
[0.84343135, 0.15656865],
[0.89487228, 0.10512772],
[0.95179045, 0.04820955],
[0.92289458, 0.07710542],
[0.99207819, 0.00792181],
[0.9544164 , 0.0455836 ],
[0.86148671, 0.13851329],
[0.70518141, 0.29481859],
[0.83739787, 0.16260213],
[0.97329371, 0.02670629],
[0.87649937, 0.12350063],
[0.99671583, 0.00328417],
[0.87453032, 0.12546968],
[0.97266819, 0.02733181],
[0.90961015, 0.09038985],
[0.97924473, 0.02075527]])
totalTree = sum(prob[1] for prob in goal_probs2)
totalBoosting = sum(prob[1] for prob in goal_probs)
totalLR = sum(prob[1] for prob in goal_prob_lr)
totalTrain = sum(y_train)
print(totalTrain)
print(totalTree)
print(totalBoosting)
print(totalLR)
5897 5897.000000000744 20918.38594849597 5897.2342367711435
y_pred_last = new_lr.predict(X_test)
accuracy_last = accuracy_score(y_test, y_pred_last)
print(accuracy_last)
0.9070960824110597
test_prob_lr = new_lr.predict_proba(X_test)
testLR = sum(prob[1] for prob in test_prob_lr)
totalTest = sum(y_test)
print(totalTest)
print(testLR)
2425 2323.50132013164
from sklearn.metrics import confusion_matrix
# [TN, FP]
# [FN, TP]
cm = confusion_matrix(y_test, y_pred_last)
print(cm)
[[23687 1] [ 2425 0]]
TP = cm[1, 1]
FP = cm[0, 1]
TN = cm[0, 0]
FN = cm[1, 0]
FPR = FP / (FP + TN)
TPR = TP / (TP + FN)
Precision = TP / (TP + FP)
Recall = TPR
from sklearn.metrics import roc_curve
FPR, TPR, thresholds = roc_curve(y_test, test_prob_lr[:,1])
fig, ax = plt.subplots(1, 1, figsize=(12,8))
ax.step(FPR, TPR, linewidth=2)
ax.plot([0,1], [0,1], '--', color = 'black')
fs = 12
ax.set_xlabel('1 - Specificity = 1 - TNR = FPR', fontsize=fs)
ax.set_ylabel('Sensitivity = TPR', fontsize=fs)
ax.tick_params(axis='both', labelsize=fs)
AUC = np.trapz(TPR, FPR)
ax.set_title('AUC = ' + str(round(AUC,4)))
Text(0.5, 1.0, 'AUC = 0.749')
Y = TPR
X = FPR
#Get closest to upper left
d_min = 999999999
for k in range(len(thresholds)):
if k < len(X) and k < len(Y):
#Euclidean distance to upper left:
d = np.sqrt((0 - X[k])**2 + (1 - Y[k])**2)
if (d < d_min):
d_min = d
k_opt = k
#Optimal threshold value?
k_opt = thresholds[k_opt]
print(k_opt)
0.0953697417247127
from sklearn.metrics import precision_recall_curve
fig, ax = plt.subplots(1, 1, figsize=(12,8))
Precision, Recall, thresholds = precision_recall_curve(y_test, test_prob_lr[:,1])
ax.step(Recall, Precision, linewidth=2)
ax.grid()
fs = 12
ax.set_xlabel('Recall (Sensitivity)', fontsize=fs)
ax.set_ylabel('Precision (PPV)', fontsize=fs)
ax.tick_params(axis='both', labelsize=fs)
ax.set_title('Precision-Recall Curve')
ax.set_ylim([0, 1])
ax.set_xlim([0, 1])
(0.0, 1.0)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
format_probs = (test_prob_lr[:,1]).reshape(-1,1)
scaled_goal_probs = std_scaler.fit_transform(format_probs)
print(scaled_goal_probs)
[[-0.54715272] [ 0.88201908] [-1.08462157] ... [ 0.07497453] [-0.60673238] [-0.69432077]]
X_test_heat = X_test.copy()
X_test_heat['new_Probs'] = scaled_goal_probs
X_test_heat['old_Probs'] = test_prob_lr[:,1]
X_heat_wcoords = pd.merge(X_test_heat, shots_sm_og[['xCord', 'yCord', 'shotType']], left_index=True, right_index=True)
X_heat_wcoords.head()
| EA | PK | PP | savePct | shooterTimeOnIce | shotAngleAdjusted | shotDistance | shotPct | shotRebound | shotType_BACK | ... | shotType_SNAP | shotType_TIP | shotType_WRAP | shotType_WRIST | timeSinceFaceoff | new_Probs | old_Probs | xCord | yCord | shotType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 74558 | 0 | 0 | 1 | -1.182744 | -1.340771 | 0.268163 | 0.442855 | -0.214660 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | -1.041803 | -0.547153 | 0.050260 | -56 | -25 | WRIST |
| 56817 | 0 | 0 | 0 | -0.140935 | -1.261046 | -0.925015 | -0.939397 | 0.224482 | 0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | -1.010511 | 0.882019 | 0.151395 | -75 | 3 | TIP |
| 70387 | 0 | 0 | 0 | -1.589026 | -1.375001 | -0.623043 | 2.848966 | 0.352954 | 0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | -1.011529 | -1.084622 | 0.012226 | -5 | 28 | SNAP |
| 64098 | 0 | 0 | 0 | 0.301050 | -1.286946 | -0.866443 | 0.322954 | 0.349951 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | -1.043428 | -0.426143 | 0.058823 | 51 | 9 | WRIST |
| 20016 | 0 | 0 | 0 | -0.765072 | -1.277717 | -0.383756 | 1.278603 | -1.560726 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | -1.007159 | -1.038069 | 0.015520 | -36 | -23 | WRIST |
5 rows × 22 columns
rink3=NHLRink()
fig, axs = plt.subplots(1, 1, figsize=(16, 12))
rink3.plot_fn(sns.scatterplot, x="xCord", y="yCord", hue="new_Probs", legend=False, data=X_heat_wcoords, ax=axs, palette="inferno");
axs.set_title('Goal Probs by Location', fontsize=18)
Text(0.5, 1.0, 'Goal Probs by Location')
rink4=NHLRink()
fig, axs = plt.subplots(1, 1, figsize=(12, 8))
rink4.heatmap(
"xCord", "yCord", "new_Probs", data=X_heat_wcoords,
binsize=4, fill_value=0, plot_xlim=(25, 89), cmap="inferno", vmax=0.6,
ax=axs, draw_kw={"display_range": "ozone"},)
<matplotlib.collections.QuadMesh at 0x7f897dd433a0>
high_chance = X_heat_wcoords[X_heat_wcoords['old_Probs'] > .15]
high_chance.head()
| EA | PK | PP | savePct | shooterTimeOnIce | shotAngleAdjusted | shotDistance | shotPct | shotRebound | shotType_BACK | ... | shotType_SNAP | shotType_TIP | shotType_WRAP | shotType_WRIST | timeSinceFaceoff | new_Probs | old_Probs | xCord | yCord | shotType | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56817 | 0 | 0 | 0 | -0.140935 | -1.261046 | -0.925015 | -0.939397 | 0.224482 | 0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | -1.010511 | 0.882019 | 0.151395 | -75 | 3 | TIP |
| 19489 | 0 | 0 | 0 | 1.348018 | -1.303423 | -1.050393 | -1.359700 | 0.634295 | 1 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | -1.033411 | 2.201093 | 0.244739 | 83 | 1 | WRIST |
| 18537 | 0 | 0 | 0 | -0.403612 | -1.290517 | -0.235821 | -1.327779 | 0.569522 | 1 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | -1.037694 | 1.402885 | 0.188254 | -83 | 3 | BACK |
| 10237 | 0 | 0 | 1 | -0.638946 | -1.278651 | -1.113845 | -0.948364 | 0.823895 | 0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | -1.017421 | 2.000320 | 0.230531 | 75 | -2 | TIP |
| 30169 | 0 | 0 | 0 | 0.611079 | -1.288104 | -1.253665 | -0.542682 | 0.317377 | 0 | 0.0 | ... | 1.0 | 0.0 | 0.0 | 0.0 | -1.021132 | 1.053946 | 0.163561 | 67 | -2 | SNAP |
5 rows × 22 columns
rink5=NHLRink()
fig, axs = plt.subplots(1, 1, figsize=(16, 12))
rink5.plot_fn(sns.scatterplot, x="xCord", y="yCord", hue="shotType", legend=True, data=high_chance, ax=axs);
axs.legend(title="Shot Type", bbox_to_anchor=(0.02, 0.98))
<matplotlib.legend.Legend at 0x7f897dd60f70>
-Hockey is not a discrete sport
-Shots, even at their best, are low probability events
-This type of model may need different framing: goal probability models might be valuable in the aggregate, but are less so at the individual shot level
-There is a spatial relationship in goal scoring that outweighs game and personal statistics
-In the future: new features (shot speed, breakaway, goalie screening, etc.), interaction terms, feature selection